### function to extract the domain ####
# Extract the host part of one or more URLs.
#
# Strips a leading "http://" / "https://" scheme and a leading "www." prefix
# (case-insensitively), then drops everything from the first "/", "?" or "#".
#
# Args:
#   x: character vector (or anything coercible with as.character) of URLs.
# Returns:
#   An unnamed character vector the same length as `x`. Elements that are NA
#   or that have no host left after stripping are returned as NA_character_.
domain <- function(x) {
  host <- as.character(x)
  # Anchor both prefixes so that a "www." occurring inside the host
  # (e.g. "awww.example.com") or a URL embedded in a query string is untouched.
  host <- sub("^(https?://)?(www\\.)?", "", host, ignore.case = TRUE)
  # The host ends at the first path, query or fragment delimiter.
  host <- sub("[/?#].*$", "", host)
  host[!is.na(host) & host == ""] <- NA_character_
  unname(host)
}

# Regex for an http(s) URL; used further down with str_extract() to pull the
# first URL out of each tweet text. Note that "$-_" inside the bracket
# expression is a character *range*, not three literal characters.
url_pattern <- "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"

require(data.table)
require(stringr)
require(plyr)
require(igraph)

# load resolved urls for tweets created by 5000 most active political retweeters during GEN 2018
resolved_full <- readRDS("./resolved_urls_feb.rds")

# Lower-case the original (shortened) URL: it is the join key used later on.
resolved_full$orig_url <- tolower(resolved_full$orig_url)

# Derive the lower-cased domain of every expanded URL. domain() is called once
# per element through vapply(), which guarantees a character vector of the
# same length as the input instead of going through apply()'s matrix coercion.
resolved_full$domain <- tolower(vapply(
  as.character(resolved_full$expanded_url),
  domain,
  character(1),
  USE.NAMES = FALSE
))

# load MINE2018 url analysis table (urls with respective facebook engagement)
url_analysis.df <- data.table(readRDS("./mine_dataset.rds"))
url_analysis.df$title_description <- NULL

# One row per domain: number of urls plus mean / standard deviation / total of
# the reaction, comment and share counts. Columns are named directly in the
# aggregation instead of being renamed afterwards.
mine_source_analysis <- url_analysis.df[
  ,
  .(
    urls = .N,
    AVG_reaction_count = mean(max_reaction_count),
    AVG_comment_count = mean(max_comment_count),
    AVG_share_count = mean(max_share_count),
    STD_reaction_count = sd(max_reaction_count),
    STD_comment_count = sd(max_comment_count),
    STD_share_count = sd(max_share_count),
    SUM_reaction_count = sum(max_reaction_count),
    SUM_comment_count = sum(max_comment_count),
    SUM_share_count = sum(max_share_count)
  ),
  by = domain
]

# Overall engagement of a domain = sum of its three per-url averages.
mine_source_analysis[
  ,
  engagement := AVG_reaction_count + AVG_comment_count + AVG_share_count
]

# load full dataset of tweets created during FEB-MARCH-04 2018 by TOP 5000 Italian political retweeters of GEN2018
# (named `tweets`, not `t`, so that base::t() is not shadowed)
tweets <- fread(
  "./Tweets_created_by_top_5000_febmarch.csv",
  sep = ",",
  header = TRUE
)

# Keep only tweets whose text mentions a link, then extract the first URL of
# each one; it is lower-cased to match resolved_full$orig_url.
tweets_with_url <- tweets[grepl("http", tweets$text), ]
rm(tweets)
tweets_with_url$orig_url <- tolower(
  str_extract(tweets_with_url$text, url_pattern)
)

# reconcile tweets with expanded urls
tweet_domains <- join(tweets_with_url, resolved_full, by = "orig_url")

# clean up: only the (user, domain) pairs are needed from here on
data_domain_small <- data.frame(
  username = tweet_domains$from_user_name,
  domain = tweet_domains$domain
)
rm(tweet_domains, tweets_with_url, resolved_full)

# remove platform domains
# Domains filtered out by exact match before the analysis.
platform_domains <- c(
  "twitter.com", "t.co", "facebook.com", "youtube.com", "m.facebook.com",
  "instagram.com", "paper.li", "soundcloud.com", "open.spotify.com",
  "spreaker.com", "issuu.com", "feeds.feedburner.com", "vimeo.com", "msn.com"
)

# A row is dropped when its domain is missing (the chained subset() calls this
# replaces discarded NA rows implicitly), is one of the platforms above, or
# contains "google" (any Google-related domain).
is_excluded <- is.na(data_domain_small$domain) |
  data_domain_small$domain %in% platform_domains |
  grepl("google", data_domain_small$domain)
data_domain_small <- data_domain_small[!is_excluded, ]
rm(is_excluded)

# Frequency table of shared domains, most shared first; keep only the domains
# shared more often than the mean frequency.
top_domain <- as.data.frame(table(data_domain_small$domain))
top_domain <- top_domain[order(-top_domain$Freq), ]
top_domain <- top_domain[top_domain$Freq > mean(top_domain$Freq), ]

data_domain_small <- data_domain_small[data_domain_small$domain %in% top_domain$Var1, ]

# clean up www. (and scheme) in the MINE2018 domains so that they are
# comparable with the domains extracted from the tweets
mine_source_analysis$domain <- tolower(vapply(
  as.character(mine_source_analysis$domain),
  domain,
  character(1),
  USE.NAMES = FALSE
))

# Keep only the domains that are present in both datasets.
mine2018_domains <- intersect(
  unique(data_domain_small$domain),
  unique(mine_source_analysis$domain)
)
data_domain_small <- data_domain_small[data_domain_small$domain %in% mine2018_domains, ]

# factor() drops unused levels when `domain` is already a factor and converts
# it when it is a character vector. droplevels() has no method for character
# vectors, which is what data.frame() produces by default since R 4.0.
data_domain_small$domain <- factor(data_domain_small$domain)

saveRDS(data_domain_small, file = "./users_domains_feb.rds")
saveRDS(mine_source_analysis, file = "./source_analysis_feb.rds")

#### network ####
# Directed graph with one edge per (username -> domain) row.
g <- graph_from_data_frame(data_domain_small, directed = TRUE)

# Give every edge a unit weight, then merge parallel edges by summing the
# weights, so that `weight` is the number of times a user shared a domain.
E(g)$weight <- 1
g <- simplify(g, edge.attr.comb = list(weight = "sum"))
E(g)$weight_n <- E(g)$weight / max(E(g)$weight) # weight rescaled by the maximum

V(g)$outdegree <- degree(g, mode = "out")
V(g)$indegree <- degree(g, mode = "in")
V(g)$degree <- degree(g, mode = "all")

# Vertices that receive edges (the domains), sorted by decreasing indegree.
media <- data.frame(name = V(g)$name, indegree = V(g)$indegree)
media <- media[order(-media$indegree), ]
media <- media[media$indegree > 0, ]

# Vertices that send edges (the users), sorted by decreasing outdegree.
users <- data.frame(name = V(g)$name, outdegree = V(g)$outdegree)
users <- users[order(-users$outdegree), ]
users <- users[users$outdegree > 0, ]

# Bipartite type: TRUE for vertices with incoming edges, FALSE otherwise.
# Assigned in one step; filling a not-yet-existing attribute through two
# partial assignments only works when the last vertex has indegree > 0.
V(g)$type <- V(g)$indegree > 0

g_domain <- g

write_graph(g, "g.graphml", format = "graphml")

# Reduced network: the domains with indegree > 100 together with the users
# linking to them. Selecting on indegree alone keeps only vertices that
# receive edges, so the induced subgraph would lose the user vertices, and
# with them every edge, leaving nothing to project.
is_top_media <- V(g)$indegree > 100
small_net <- induced_subgraph(g, V(g)[is_top_media | !V(g)$type])
# Drop the users left without any edge to one of the retained domains.
small_net <- delete_vertices(small_net, V(small_net)[degree(small_net) == 0])

print(bipartite_projection_size(small_net))
g_b <- bipartite_projection(small_net)

# NOTE(review): the original statement was truncated ("write.graph(g_b$)") and
# did not parse. proj2 is the projection on the type == TRUE vertices; the
# projection and the output file name are assumptions -- confirm. g_b$proj1
# (type == FALSE vertices) can be exported the same way if needed.
write_graph(g_b$proj2, "g_b_domains.graphml", format = "graphml")












